#!/usr/bin/env python3


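#### Which registered editors show up in our samples?
##
## takes in postproc'd wikiq data for our samples in turn.
## keeps the revisions whose `anon` field is not "TRUE" and collects the `editor` of each one.
## de-duplicates the editors across the samples.
## saves out the resulting list as usernameList.tsv in the output directory.
##
## NOTE(review): this header used to describe matching each revision to the most-recent
## prior revision of the editor's user page and adding the page length as a new field
## (run once per sample). Nothing in this script does that -- presumably the text was
## carried over from a related script; confirm and drop this note.
##
##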


import sys
# add pyspark to your python path e.g.
#sys.path.append("/home/nathante/sparkstuff/spark/python/pyspark")
#sys.path.append("/home/nathante/sparkstuff/spark/python/")
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import Window
import pyspark.sql.functions as f
from pyspark.sql import types
from pyspark.sql.types import IntegerType, StringType, DateType
import argparse
import glob
from os import mkdir
from os import path

# Sample directories (postprocessed revision data) to pull editors from.
# NOTE: check how the rest of the script consumes this list before adding,
# removing or reordering entries.
inputDir = [
    '/gscratch/comdata/users/kaylea/taboo/processed_data/euph/revData/',
    # currently disabled sample:
    # '/gscratch/comdata/users/kaylea/taboo/processed_data/coefs/taboo/revData/',
    '/gscratch/comdata/users/kaylea/taboo/processed_data/ngram/revData/',
]

# Directory that receives the generated username list.
output_dir = '/gscratch/comdata/users/kaylea/taboo/processed_data/witheuph_uniqueRegisteredUsersInSample/'



# Build the Spark configuration (application name) and hand it to the session
# builder; without .config(conf=conf) the conf object is created but never
# used, so the application name would not be applied.
# getOrCreate() returns an already-running session if one exists.
conf = SparkConf().setAppName("Unique Decoded Usernames")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

def _registered_editors(sample_dir):
    """Return the distinct editors of the non-anonymous revisions in one sample.

    sample_dir is passed to spark.read.csv and parsed as tab-separated text
    with a header row; the data must provide `anon` and `editor` columns.
    Rows whose `anon` value is "TRUE" are filtered out.  De-duplication is
    done in Spark so that only one row per editor is brought to the driver.
    """
    revisions = spark.read.csv(sample_dir, sep='\t', inferSchema=True, header=True, mode="PERMISSIVE")
    editors = revisions.filter(revisions.anon != "TRUE").select('editor').distinct()
    return list(editors.toPandas()['editor'])


# Collect the registered editors of every sample listed in inputDir (rather
# than addressing individual entries by position) and merge them into a
# single de-duplicated list.
uniqueUsers = set()
for sample_dir in inputDir:
    uniqueUsers.update(_registered_editors(sample_dir))
userList = list(uniqueUsers)

# Local import: only `mkdir` and `path` are imported from os at the top of the file.
from os import makedirs

# Write the editor list as a one-column TSV (header: encodedEditor), one editor per line.
# makedirs(..., exist_ok=True) also creates missing parent directories and is a
# no-op when the directory already exists.
makedirs(output_dir, exist_ok=True)
# path.join keeps the file inside output_dir whether or not output_dir ends
# with a slash.  Editor names may contain non-ASCII characters, so the
# encoding is pinned to UTF-8 instead of depending on the locale default.
with open(path.join(output_dir, 'usernameList.tsv'), 'w', encoding='utf-8') as fp:
    fp.write('encodedEditor\n')
    fp.writelines(f"{user}\n" for user in userList)

